!WhatsApp Image 2022-05-27 at 3.48.07 PM.jpeg

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt 
import seaborn as sns
from plotly.offline import init_notebook_mode, iplot, plot
import plotly as py
init_notebook_mode(connected=True)
import plotly.graph_objs as go



# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Read Data¶

In [2]:
df = pd.read_csv('Clean_Dataset.csv')
df.head()
Out[2]:
Unnamed: 0 airline flight source_city departure_time stops arrival_time destination_city class duration days_left price
0 0 SpiceJet SG-8709 Delhi Evening zero Night Mumbai Economy 2.17 1 5953
1 1 SpiceJet SG-8157 Delhi Early_Morning zero Morning Mumbai Economy 2.33 1 5953
2 2 AirAsia I5-764 Delhi Early_Morning zero Early_Morning Mumbai Economy 2.17 1 5956
3 3 Vistara UK-995 Delhi Morning zero Afternoon Mumbai Economy 2.25 1 5955
4 4 Vistara UK-963 Delhi Morning zero Morning Mumbai Economy 2.33 1 5955
In [3]:
df1 = pd.read_csv('business.csv')
df2 = pd.read_csv('economy.csv')
In [4]:
df1.head()
Out[4]:
date airline ch_code num_code dep_time from time_taken stop arr_time to price
0 11-02-2022 Air India AI 868 18:00 Delhi 02h 00m non-stop 20:00 Mumbai 25,612
1 11-02-2022 Air India AI 624 19:00 Delhi 02h 15m non-stop 21:15 Mumbai 25,612
2 11-02-2022 Air India AI 531 20:00 Delhi 24h 45m 1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t... 20:45 Mumbai 42,220
3 11-02-2022 Air India AI 839 21:25 Delhi 26h 30m 1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t... 23:55 Mumbai 44,450
4 11-02-2022 Air India AI 544 17:15 Delhi 06h 40m 1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t... 23:55 Mumbai 46,690
In [5]:
df2.head()
Out[5]:
date airline ch_code num_code dep_time from time_taken stop arr_time to price
0 11-02-2022 SpiceJet SG 8709 18:55 Delhi 02h 10m non-stop 21:05 Mumbai 5,953
1 11-02-2022 SpiceJet SG 8157 06:20 Delhi 02h 20m non-stop 08:40 Mumbai 5,953
2 11-02-2022 AirAsia I5 764 04:25 Delhi 02h 10m non-stop 06:35 Mumbai 5,956
3 11-02-2022 Vistara UK 995 10:20 Delhi 02h 15m non-stop 12:35 Mumbai 5,955
4 11-02-2022 Vistara UK 963 08:50 Delhi 02h 20m non-stop 11:10 Mumbai 5,955
In [6]:
df1['price'] = df1['price'].str.replace(',', '')
df1['price'] = df1['price'].astype(int)

df2['price'] = df2['price'].str.replace(',', '')
df2['price'] = df2['price'].astype(int)
In [7]:
df1['stop'] = df1['stop'].str.replace('\n', '')
df1['stop'] = df1['stop'].str.replace('\t', '')
df1['stop'].str.strip()

df2['stop'] = df2['stop'].str.replace('\n', '')
df2['stop'] = df2['stop'].str.replace('\t', '')
df2['stop'].str.strip()
Out[7]:
0         non-stop
1         non-stop
2         non-stop
3         non-stop
4         non-stop
            ...   
206769      1-stop
206770      1-stop
206771      1-stop
206772      1-stop
206773      1-stop
Name: stop, Length: 206774, dtype: object
In [8]:
df1['time_taken'] = df1['time_taken'].apply(lambda x: x.split("h")[0])
df1['time_taken'] = df1['time_taken'].astype(float)

df2['time_taken'] = df2['time_taken'].apply(lambda x: x.split("h")[0])
df2['time_taken'] = df2['time_taken'].astype(float)
In [9]:
df1['time_taken'] = df1['time_taken'].astype(int)
df2['time_taken'] = df2['time_taken'].astype(int)
In [10]:
df1['date'] = pd.to_datetime(df1['date'], format='%d-%m-%Y')
df1.dtypes
Out[10]:
date          datetime64[ns]
airline               object
ch_code               object
num_code               int64
dep_time              object
from                  object
time_taken             int64
stop                  object
arr_time              object
to                    object
price                  int64
dtype: object
In [11]:
df2['date'] = pd.to_datetime(df2['date'], format='%d-%m-%Y')
df2.dtypes
Out[11]:
date          datetime64[ns]
airline               object
ch_code               object
num_code               int64
dep_time              object
from                  object
time_taken             int64
stop                  object
arr_time              object
to                    object
price                  int64
dtype: object
In [12]:
print("Cleand Dataset Shape:", df.shape)
print("Business Dataset Shape:", df1.shape)
print("Economy Dataset Shape:", df2.shape)
Cleand Dataset Shape: (300153, 12)
Business Dataset Shape: (93487, 11)
Economy Dataset Shape: (206774, 11)
In [13]:
df1['class'] = 'Business'
In [14]:
df2['class'] = 'Economy'
In [15]:
display(df2.head())

df.head()
date airline ch_code num_code dep_time from time_taken stop arr_time to price class
0 2022-02-11 SpiceJet SG 8709 18:55 Delhi 2 non-stop 21:05 Mumbai 5953 Economy
1 2022-02-11 SpiceJet SG 8157 06:20 Delhi 2 non-stop 08:40 Mumbai 5953 Economy
2 2022-02-11 AirAsia I5 764 04:25 Delhi 2 non-stop 06:35 Mumbai 5956 Economy
3 2022-02-11 Vistara UK 995 10:20 Delhi 2 non-stop 12:35 Mumbai 5955 Economy
4 2022-02-11 Vistara UK 963 08:50 Delhi 2 non-stop 11:10 Mumbai 5955 Economy
Out[15]:
Unnamed: 0 airline flight source_city departure_time stops arrival_time destination_city class duration days_left price
0 0 SpiceJet SG-8709 Delhi Evening zero Night Mumbai Economy 2.17 1 5953
1 1 SpiceJet SG-8157 Delhi Early_Morning zero Morning Mumbai Economy 2.33 1 5953
2 2 AirAsia I5-764 Delhi Early_Morning zero Early_Morning Mumbai Economy 2.17 1 5956
3 3 Vistara UK-995 Delhi Morning zero Afternoon Mumbai Economy 2.25 1 5955
4 4 Vistara UK-963 Delhi Morning zero Morning Mumbai Economy 2.33 1 5955
In [16]:
new_df=pd.concat([df1, df2])
new_df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 300261 entries, 0 to 206773
Data columns (total 12 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   date        300261 non-null  datetime64[ns]
 1   airline     300261 non-null  object        
 2   ch_code     300261 non-null  object        
 3   num_code    300261 non-null  int64         
 4   dep_time    300261 non-null  object        
 5   from        300261 non-null  object        
 6   time_taken  300261 non-null  int64         
 7   stop        300261 non-null  object        
 8   arr_time    300261 non-null  object        
 9   to          300261 non-null  object        
 10  price       300261 non-null  int64         
 11  class       300261 non-null  object        
dtypes: datetime64[ns](1), int64(3), object(8)
memory usage: 29.8+ MB
In [17]:
new_df.date.value_counts()
Out[17]:
2022-03-07    6633
2022-02-28    6603
2022-03-21    6594
2022-03-14    6586
2022-03-08    6577
2022-03-06    6543
2022-03-01    6541
2022-03-15    6536
2022-03-13    6536
2022-03-22    6535
2022-03-10    6527
2022-03-23    6526
2022-03-20    6513
2022-03-12    6505
2022-03-24    6503
2022-03-02    6502
2022-03-04    6495
2022-03-18    6491
2022-03-03    6483
2022-03-19    6480
2022-03-25    6473
2022-03-26    6440
2022-02-27    6420
2022-02-21    6418
2022-03-16    6412
2022-03-05    6405
2022-02-23    6404
2022-03-11    6398
2022-02-22    6385
2022-03-09    6361
2022-02-24    6353
2022-02-25    6342
2022-03-27    6314
2022-03-17    6296
2022-02-26    6279
2022-03-28    6160
2022-03-31    6157
2022-03-30    6078
2022-03-29    6072
2022-02-20    5823
2022-02-18    5768
2022-02-16    5740
2022-02-17    5703
2022-02-19    5669
2022-02-15    5395
2022-02-14    5079
2022-02-13    4250
2022-02-12    4031
2022-02-11    1927
Name: date, dtype: int64
In [18]:
new_df.head()
Out[18]:
date airline ch_code num_code dep_time from time_taken stop arr_time to price class
0 2022-02-11 Air India AI 868 18:00 Delhi 2 non-stop 20:00 Mumbai 25612 Business
1 2022-02-11 Air India AI 624 19:00 Delhi 2 non-stop 21:15 Mumbai 25612 Business
2 2022-02-11 Air India AI 531 20:00 Delhi 24 1-stop 20:45 Mumbai 42220 Business
3 2022-02-11 Air India AI 839 21:25 Delhi 26 1-stop 23:55 Mumbai 44450 Business
4 2022-02-11 Air India AI 544 17:15 Delhi 6 1-stop 23:55 Mumbai 46690 Business
In [19]:
new_df.ch_code.value_counts()
Out[19]:
UK    127859
AI     80894
6E     43120
G8     23177
I5     16098
SG      9011
S5        61
2T        41
Name: ch_code, dtype: int64
In [20]:
new_df['num_code'].value_counts()
Out[20]:
808     3313
706     3235
772     2860
774     2808
720     2650
        ... 
1058       1
6613       1
405        1
9923       1
8913       1
Name: num_code, Length: 1255, dtype: int64
In [21]:
new_df.stop.value_counts()
Out[21]:
1-stop                      243603
non-stop                     36044
2+-stop                      13288
1-stopVia IXU                 1839
1-stopVia IDR                 1398
1-stopVia Patna                674
1-stopVia Indore               381
1-stopVia PAT                  354
1-stopVia MYQ                  321
1-stopVia Bhubaneswar          301
1-stopVia KLH                  284
1-stopVia JGB                  193
1-stopVia JRG                  175
1-stopVia STV                  169
1-stopVia BBI                  158
1-stopVia Delhi                153
1-stopVia Hyderabad            143
1-stopVia IXE                  120
1-stopVia Ranchi               114
1-stopVia Raipur               102
1-stopVia Chennai               77
1-stopVia Guwahati              55
1-stopVia Mysore                45
1-stopVia Mangalore             39
1-stopVia Mumbai                31
1-stopVia Kolkata               28
1-stopVia VTZ                   27
1-stopVia Nagpur                25
1-stopVia RPR                   22
1-stopVia NDC                   22
1-stopVia GOP                   16
1-stopVia Surat                 14
1-stopVia Lucknow               13
1-stopVia NAG                    9
1-stopVia Vishakhapatnam         8
1-stopVia Kolhapur               5
1-stopVia IXR                    4
1-stopVia GAU                    3
1-stopVia GAY                    3
1-stopVia HYD                    1
Name: stop, dtype: int64
In [22]:
df['flight'].value_counts()
Out[22]:
UK-706     3235
UK-772     2741
UK-720     2650
UK-836     2542
UK-822     2468
           ... 
6E-2939       1
SG-9923       1
6E-865        1
SG-8106       1
SG-8480       1
Name: flight, Length: 1561, dtype: int64
In [23]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300153 entries, 0 to 300152
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        300153 non-null  int64  
 1   airline           300153 non-null  object 
 2   flight            300153 non-null  object 
 3   source_city       300153 non-null  object 
 4   departure_time    300153 non-null  object 
 5   stops             300153 non-null  object 
 6   arrival_time      300153 non-null  object 
 7   destination_city  300153 non-null  object 
 8   class             300153 non-null  object 
 9   duration          300153 non-null  float64
 10  days_left         300153 non-null  int64  
 11  price             300153 non-null  int64  
dtypes: float64(1), int64(3), object(8)
memory usage: 27.5+ MB
In [24]:
df3=df.drop(['Unnamed: 0'],axis=1)
df3.head()
Out[24]:
airline flight source_city departure_time stops arrival_time destination_city class duration days_left price
0 SpiceJet SG-8709 Delhi Evening zero Night Mumbai Economy 2.17 1 5953
1 SpiceJet SG-8157 Delhi Early_Morning zero Morning Mumbai Economy 2.33 1 5953
2 AirAsia I5-764 Delhi Early_Morning zero Early_Morning Mumbai Economy 2.17 1 5956
3 Vistara UK-995 Delhi Morning zero Afternoon Mumbai Economy 2.25 1 5955
4 Vistara UK-963 Delhi Morning zero Morning Mumbai Economy 2.33 1 5955
In [25]:
df3['days_left'] = df3['days_left'].astype(int)
In [26]:
df3.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300153 entries, 0 to 300152
Data columns (total 11 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   airline           300153 non-null  object 
 1   flight            300153 non-null  object 
 2   source_city       300153 non-null  object 
 3   departure_time    300153 non-null  object 
 4   stops             300153 non-null  object 
 5   arrival_time      300153 non-null  object 
 6   destination_city  300153 non-null  object 
 7   class             300153 non-null  object 
 8   duration          300153 non-null  float64
 9   days_left         300153 non-null  int64  
 10  price             300153 non-null  int64  
dtypes: float64(1), int64(2), object(8)
memory usage: 25.2+ MB
In [27]:
df3.duration.value_counts().sort_values()
Out[27]:
41.50       1
39.25       1
35.67       1
36.25       1
41.58       1
         ... 
2.83     2323
2.08     2755
2.75     2879
2.25     4036
2.17     4242
Name: duration, Length: 476, dtype: int64
In [28]:
df3.describe().T
Out[28]:
count mean std min 25% 50% 75% max
duration 300153.0 12.221021 7.191997 0.83 6.83 11.25 16.17 49.83
days_left 300153.0 26.004751 13.561004 1.00 15.00 26.00 38.00 49.00
price 300153.0 20889.660523 22697.767366 1105.00 4783.00 7425.00 42521.00 123071.00
In [29]:
df.isnull().sum()
Out[29]:
Unnamed: 0          0
airline             0
flight              0
source_city         0
departure_time      0
stops               0
arrival_time        0
destination_city    0
class               0
duration            0
days_left           0
price               0
dtype: int64
In [30]:
sns.heatmap(df1.isnull())
Out[30]:
<AxesSubplot:>
In [31]:
df3['source_city'].value_counts()
Out[31]:
Delhi        61343
Mumbai       60896
Bangalore    52061
Kolkata      46347
Hyderabad    40806
Chennai      38700
Name: source_city, dtype: int64
In [32]:
df3['destination_city'].value_counts()
Out[32]:
Mumbai       59097
Delhi        57360
Bangalore    51068
Kolkata      49534
Hyderabad    42726
Chennai      40368
Name: destination_city, dtype: int64
In [33]:
df3['stops'].unique()
Out[33]:
array(['zero', 'one', 'two_or_more'], dtype=object)
In [34]:
new_df.corr().style.background_gradient()
Out[34]:
num_code time_taken price
num_code 1.000000 -0.133619 -0.212865
time_taken -0.133619 1.000000 0.205324
price -0.212865 0.205324 1.000000

Data Visualization Questions¶

airline, class, stops for price¶

In [35]:
plt.figure(figsize=(20, 10))
sns.countplot(x='airline',hue='class' ,data=new_df).set(title='airline count based on class')
Out[35]:
[Text(0.5, 1.0, 'airline count based on class')]
In [36]:
# From graph we can see that Jet Vistara have the highest Price.
# Apart from the first Airline almost all are having similar median

# Airline vs Price
sns.catplot(y = "price", x = "airline", data = df3.sort_values("price", ascending = False), kind="boxen", height = 6, aspect = 3)
plt.show()
In [37]:
fig, axes = plt.subplots(1,2, figsize=(20,10))
sns.barplot(x='source_city', y='price', data=df3.sort_values('source_city', axis=0), ax=axes[0])
axes[0].set(title='Price variation based on source city')

sns.barplot(x='destination_city', y='price', data=df3.sort_values('destination_city', axis=0), ax=axes[1])
axes[1].set(title='Price variation based on destination city');
In [38]:
sns.relplot(col="airline", y="price", kind="line",x='stops', data=df3, col_wrap=2)
Out[38]:
<seaborn.axisgrid.FacetGrid at 0x7fc83e20c5e0>
📌   This explains a little bit about why Vsitara and Air India are the most expensive airlines, as we can see the one-stop condition is the highest price among the others.

source and destenation city respect to stops.¶

In [39]:
fig, axes = plt.subplots(1,2, figsize=(20,10))
sns.barplot(x='airline', y='price', data=df3.sort_values('airline', axis=0), ax=axes[0])
axes[0].set(title='Price variation based on airline')

sns.countplot(x='airline',hue='stops', data=df3.sort_values('stops', axis=0), ax=axes[1])
axes[1].set(title='Airline count based on stop');
In [40]:
plt.figure(figsize=(20, 10))
sns.countplot(x='source_city',hue='stops',data=df3).set(title='Source city count based on stops')
Out[40]:
[Text(0.5, 1.0, 'Source city count based on stops')]
In [41]:
plt.figure(figsize=(20, 10))
sns.countplot(x='destination_city',hue='stops',data=df3).set(title='Destination city count based on stops')
Out[41]:
[Text(0.5, 1.0, 'Destination city count based on stops')]
In [42]:
# Source vs Price

sns.catplot(y = "price", x = "source_city", data = df3.sort_values("price", ascending = False), kind="boxen", height = 4, aspect = 3)
plt.show()
📌   The one stop condition is the most common one according to the graphs and the hieghst price.

Arrival and departure time¶

In [43]:
plt.figure(figsize=(20, 10))
sns.countplot(x='arrival_time',hue='stops',data=df3).set(title='Arrival time count based on stops')
Out[43]:
[Text(0.5, 1.0, 'Arrival time count based on stops')]
In [44]:
plt.figure(figsize=(20, 10))
sns.countplot(x='departure_time',hue='stops',data=df3).set(title='Departure time count based on stops')
Out[44]:
[Text(0.5, 1.0, 'Departure time count based on stops')]
📌   most of flights with one stop happend on Night on the arrival time and Morning on the departure time.

Airways service¶

In [45]:
plt.figure(figsize=(20, 10))
sns.countplot(x='ch_code', data=new_df).set(title='Airways service count')
Out[45]:
[Text(0.5, 1.0, 'Airways service count')]
In [46]:
plt.figure(figsize=(20, 10))
sns.barplot(x='ch_code', y='price' ,data=new_df).set(title='Price variation based on airways service')
Out[46]:
[Text(0.5, 1.0, 'Price variation based on airways service')]
📌   based on the last two graphs the UK airways service is the most expensive airways and the most common one.
In [47]:
plt.figure(figsize=(20, 15))
sns.stripplot(x='airline',y='price',hue='days_left',data=df3).set(title='Price variation based on airline respect to days left')
Out[47]:
[Text(0.5, 1.0, 'Price variation based on airline respect to days left')]

Question) Does the ticket booking date affect the price?

The price changes depending on the date of booking the ticket, especially if it is booked a day or two before the flight date, the price will become higher. Also there is a drop in prices one day before the departure.

In [48]:
sns.relplot(col="departure_time", y="price", kind="line",x='arrival_time', data=df3, col_wrap=3)
Out[48]:
<seaborn.axisgrid.FacetGrid at 0x7fc861e43400>

Question) Do the times of the day affect the price?

The price changing based on the departure time and arrival time we can see some patterns such as when departure time is evening and Arrival time is night this is lowering the price.

In [49]:
sns.relplot(col="source_city", y="price", kind="line",x='destination_city', data=df3, col_wrap=3)
Out[49]:
<seaborn.axisgrid.FacetGrid at 0x7fc873d2beb0>

Question) Does the destination and source city affect the price?

Yes, and we can see from the above some information such Delhi price will be high if your destination is bangalore.

In [50]:
plt.figure(figsize=(20, 10))
sns.barplot(x='airline', y='price',hue='class' ,data=df3).set(title='Price variation based on airline respect to class')
Out[50]:
[Text(0.5, 1.0, 'Price variation based on airline respect to class')]
📌   Here we can see some vast variations in Economy class and Business class but this only happened in the two airlines.
In [51]:
plt.figure(figsize=(20, 10))
sns.barplot(x='airline',y='price',hue='stops',data=df3).set(title='Price variation based on airline respect to stops')
Out[51]:
[Text(0.5, 1.0, 'Price variation based on airline respect to stops')]

Question) Does the stops affect the price?

Yes, and we can see the price is much lower in flight that has zero stops, and for both Vistara and Air India the one stop is much higher price than other airlines.

In [52]:
fig, axes = plt.subplots(1,2, figsize=(20,10))

sns.barplot(x='airline', y='price',hue='class' ,data=df1, ax=axes[0])
axes[0].set(title='Business class price variation based on airline')
sns.barplot(x='airline', y='price',hue='class' ,data=df2, ax=axes[1])
axes[1].set(title='Economy class price variation based on airline')
Out[52]:
[Text(0.5, 1.0, 'Economy class price variation based on airline')]
📌   The most expensive airline is Vistara and the second is Air India in both class.

Time Series analysis¶

In [53]:
trace1 = go.Scatter(
                    x = df1.date,
                    y = df1.price,
                    mode = "lines",
                    name = "citations",
                    marker = dict(color = 'rgba(16, 112, 2, 0.8)'),
                    text= df1.to)

data = [trace1]
layout = dict(title = 'Price variation with dates destnation cities on business class',
              xaxis= dict(title= 'Price Variation',ticklen= 5,zeroline= False)
             )
fig = dict(data = data, layout = layout)
iplot(fig)
In [54]:
trace1 = go.Scatter(
                    x = df2.date,
                    y = df2.price,
                    mode = "lines",
                    name = "citations",
                    marker = dict(color = 'rgba(16, 112, 2, 0.8)'),
                    text= df2.to)

data = [trace1]
layout = dict(title = 'Price variation with dates destnation cities on economy class',
              xaxis= dict(title= 'Price Variation',ticklen= 5,zeroline= False)
             )
fig = dict(data = data, layout = layout)
iplot(fig)
In [55]:
trace1 = go.Scatter(
                    x = df1.date,
                    y = df1.price,
                    mode = "lines",
                    name = "citations",
                    marker = dict(color = 'rgba(16, 112, 2, 0.8)'),
                    text= df1['from'])

data = [trace1]
layout = dict(title = 'Price variation with dates departure cities on business class',
              xaxis= dict(title= 'Price Variation',ticklen= 5,zeroline= False)
             )
fig = dict(data = data, layout = layout)
iplot(fig)
In [56]:
trace1 = go.Scatter(
                    x = df2.date,
                    y = df2.price,
                    mode = "lines",
                    name = "citations",
                    marker = dict(color = 'rgba(16, 112, 2, 0.8)'),
                    text= df2['from'])

data = [trace1]
layout = dict(title = 'Price variation with dates departure cities on economy class',
              xaxis= dict(title= 'Price Variation',ticklen= 5,zeroline= False)
             )
fig = dict(data = data, layout = layout)
iplot(fig)
In [57]:
from matplotlib.dates import DateFormatter

fig, ax = plt.subplots(figsize=(20, 10))
sns.lineplot(x='date', y='price',hue='to' ,data=df1, ci=None).set(title="date and time respect to destantion city in business class")
date_form = DateFormatter("%m-%d")
ax.xaxis.set_major_formatter(date_form)
📌   Kolkata is the highest destination city price among the others in business class.
In [58]:
from matplotlib.dates import DateFormatter

fig, ax = plt.subplots(figsize=(20, 10))
sns.lineplot(x='date', y='price',hue='from' ,data=df1, ci=None).set(title="date and time respect to depature city in business class")
date_form = DateFormatter("%m-%d")
ax.xaxis.set_major_formatter(date_form)
📌   Kolkata is the highest departure city price among the others in business class.
In [59]:
from matplotlib.dates import DateFormatter

fig, ax = plt.subplots(figsize=(20, 10))
sns.lineplot(x='date', y='price',hue='to' ,data=df2, ci=None).set(title="date and time respect to destination city in economy class")
date_form = DateFormatter("%m-%d")
ax.xaxis.set_major_formatter(date_form)
📌   Kolkata is the highest departure city price among the others in economy class, and there are maybe some seasons on dates before 02-22 because the prices are much higher on this dates.
In [60]:
from matplotlib.dates import DateFormatter

fig, ax = plt.subplots(figsize=(20, 10))
sns.lineplot(x='date', y='price',hue='from' ,data=df2, ci=None).set(title="date and time respect to departure city in business class")
date_form = DateFormatter("%m-%d")
ax.xaxis.set_major_formatter(date_form)
📌   Kolkata is the highest destination city price among the others in economy class, and there are maybe some seasons on dates before 02-22 because the prices are much higher on this dates.
In [61]:
dataframe = new_df.set_index("date")
dataframe
Out[61]:
airline ch_code num_code dep_time from time_taken stop arr_time to price class
date
2022-02-11 Air India AI 868 18:00 Delhi 2 non-stop 20:00 Mumbai 25612 Business
2022-02-11 Air India AI 624 19:00 Delhi 2 non-stop 21:15 Mumbai 25612 Business
2022-02-11 Air India AI 531 20:00 Delhi 24 1-stop 20:45 Mumbai 42220 Business
2022-02-11 Air India AI 839 21:25 Delhi 26 1-stop 23:55 Mumbai 44450 Business
2022-02-11 Air India AI 544 17:15 Delhi 6 1-stop 23:55 Mumbai 46690 Business
... ... ... ... ... ... ... ... ... ... ... ...
2022-03-31 Vistara UK 832 07:05 Chennai 13 1-stop 20:55 Hyderabad 7697 Economy
2022-03-31 Vistara UK 832 07:05 Chennai 13 1-stop 20:55 Hyderabad 7709 Economy
2022-03-31 Vistara UK 826 12:30 Chennai 20 1-stop 09:05 Hyderabad 8640 Economy
2022-03-31 Vistara UK 822 09:45 Chennai 23 1-stop 09:05 Hyderabad 8640 Economy
2022-03-31 Vistara UK 824 20:30 Chennai 24 1-stop 20:55 Hyderabad 8640 Economy

300261 rows × 11 columns

📌   The most expensive cities are Chennai and Kolkata.
In [62]:
df3['price_per_minute'] = df3['price']/df3['duration']*60
In [63]:
plt.figure(figsize=(20,8))

plt.subplot(1,2,1)
plt.title('Price Distribution Plot')
sns.kdeplot(df3.price)

plt.subplot(1,2,2)
plt.title('Price per minute Distribution Plot')
sns.kdeplot(df3.price_per_minute)

plt.show()
📌   The price per minute is skewed at the beginning to the lowest prices.

Modeling¶

In [64]:
df3.columns
Out[64]:
Index(['airline', 'flight', 'source_city', 'departure_time', 'stops',
       'arrival_time', 'destination_city', 'class', 'duration', 'days_left',
       'price', 'price_per_minute'],
      dtype='object')
In [65]:
df3.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300153 entries, 0 to 300152
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   airline           300153 non-null  object 
 1   flight            300153 non-null  object 
 2   source_city       300153 non-null  object 
 3   departure_time    300153 non-null  object 
 4   stops             300153 non-null  object 
 5   arrival_time      300153 non-null  object 
 6   destination_city  300153 non-null  object 
 7   class             300153 non-null  object 
 8   duration          300153 non-null  float64
 9   days_left         300153 non-null  int64  
 10  price             300153 non-null  int64  
 11  price_per_minute  300153 non-null  float64
dtypes: float64(2), int64(2), object(8)
memory usage: 27.5+ MB
In [66]:
df3['departure_time'].value_counts()
Out[66]:
Morning          71146
Early_Morning    66790
Evening          65102
Night            48015
Afternoon        47794
Late_Night        1306
Name: departure_time, dtype: int64
In [67]:
df3['arrival_time'].value_counts()
Out[67]:
Night            91538
Evening          78323
Morning          62735
Afternoon        38139
Early_Morning    15417
Late_Night       14001
Name: arrival_time, dtype: int64
In [68]:
df3['source_city'].value_counts()
Out[68]:
Delhi        61343
Mumbai       60896
Bangalore    52061
Kolkata      46347
Hyderabad    40806
Chennai      38700
Name: source_city, dtype: int64
In [69]:
df3['stops'].value_counts()
Out[69]:
one            250863
zero            36004
two_or_more     13286
Name: stops, dtype: int64
In [70]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score, mean_absolute_error
from sklearn.preprocessing import LabelEncoder,OrdinalEncoder,MinMaxScaler
In [71]:
# As airline is Nominal Categorical data we will perform OneHotEncoding

airline = df3[["airline"]]

airline = pd.get_dummies(airline, drop_first= True)

airline.head()
Out[71]:
airline_Air_India airline_GO_FIRST airline_Indigo airline_SpiceJet airline_Vistara
0 0 0 0 1 0
1 0 0 0 1 0
2 0 0 0 0 0
3 0 0 0 0 1
4 0 0 0 0 1
In [72]:
# As source city is Nominal Categorical data we will perform OneHotEncoding

source = df3[["source_city"]]

source = pd.get_dummies(source, drop_first= True)

source.head()
Out[72]:
source_city_Chennai source_city_Delhi source_city_Hyderabad source_city_Kolkata source_city_Mumbai
0 0 1 0 0 0
1 0 1 0 0 0
2 0 1 0 0 0
3 0 1 0 0 0
4 0 1 0 0 0
In [73]:
# As departure time is Nominal Categorical data we will perform OneHotEncoding

departure = df3[["departure_time"]]

departure = pd.get_dummies(departure, drop_first= True)

departure.head()
Out[73]:
departure_time_Early_Morning departure_time_Evening departure_time_Late_Night departure_time_Morning departure_time_Night
0 0 1 0 0 0
1 1 0 0 0 0
2 1 0 0 0 0
3 0 0 0 1 0
4 0 0 0 1 0
In [74]:
# As departure time is Nominal Categorical data we will perform OneHotEncoding

arrival = df3[["arrival_time"]]

arrival = pd.get_dummies(arrival, drop_first= True)

arrival.head()
Out[74]:
arrival_time_Early_Morning arrival_time_Evening arrival_time_Late_Night arrival_time_Morning arrival_time_Night
0 0 0 0 0 1
1 0 0 0 1 0
2 1 0 0 0 0
3 0 0 0 0 0
4 0 0 0 1 0
In [75]:
# As destenation city is Nominal Categorical data we will perform OneHotEncoding

destination = df3[["destination_city"]]

destination = pd.get_dummies(destination, drop_first= True)

destination.head()
Out[75]:
destination_city_Chennai destination_city_Delhi destination_city_Hyderabad destination_city_Kolkata destination_city_Mumbai
0 0 0 0 0 1
1 0 0 0 0 1
2 0 0 0 0 1
3 0 0 0 0 1
4 0 0 0 0 1
In [76]:
df3.replace({"zero": 0, "one": 1, "two_or_more": 2}, inplace = True)
In [77]:
df3.replace({"Economy": 0, "Business": 1}, inplace = True)
In [78]:
df4 = pd.concat([df3, airline, source, departure, arrival, destination], axis = 1)
In [79]:
df4.head()
Out[79]:
airline flight source_city departure_time stops arrival_time destination_city class duration days_left ... arrival_time_Early_Morning arrival_time_Evening arrival_time_Late_Night arrival_time_Morning arrival_time_Night destination_city_Chennai destination_city_Delhi destination_city_Hyderabad destination_city_Kolkata destination_city_Mumbai
0 SpiceJet SG-8709 Delhi Evening 0 Night Mumbai 0 2.17 1 ... 0 0 0 0 1 0 0 0 0 1
1 SpiceJet SG-8157 Delhi Early_Morning 0 Morning Mumbai 0 2.33 1 ... 0 0 0 1 0 0 0 0 0 1
2 AirAsia I5-764 Delhi Early_Morning 0 Early_Morning Mumbai 0 2.17 1 ... 1 0 0 0 0 0 0 0 0 1
3 Vistara UK-995 Delhi Morning 0 Afternoon Mumbai 0 2.25 1 ... 0 0 0 0 0 0 0 0 0 1
4 Vistara UK-963 Delhi Morning 0 Morning Mumbai 0 2.33 1 ... 0 0 0 1 0 0 0 0 0 1

5 rows × 37 columns

In [80]:
df4.drop(["airline", "flight", 'source_city', "departure_time", 'arrival_time', 'destination_city', 'price_per_minute'], axis = 1, inplace = True)
In [81]:
df4.head()
Out[81]:
stops class duration days_left price airline_Air_India airline_GO_FIRST airline_Indigo airline_SpiceJet airline_Vistara ... arrival_time_Early_Morning arrival_time_Evening arrival_time_Late_Night arrival_time_Morning arrival_time_Night destination_city_Chennai destination_city_Delhi destination_city_Hyderabad destination_city_Kolkata destination_city_Mumbai
0 0 0 2.17 1 5953 0 0 0 1 0 ... 0 0 0 0 1 0 0 0 0 1
1 0 0 2.33 1 5953 0 0 0 1 0 ... 0 0 0 1 0 0 0 0 0 1
2 0 0 2.17 1 5956 0 0 0 0 0 ... 1 0 0 0 0 0 0 0 0 1
3 0 0 2.25 1 5955 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 0 1
4 0 0 2.33 1 5955 0 0 0 0 1 ... 0 0 0 1 0 0 0 0 0 1

5 rows × 30 columns

In [82]:
X = df4.drop(['price'], axis=1)
y = df4['price']
In [83]:
X
Out[83]:
stops class duration days_left airline_Air_India airline_GO_FIRST airline_Indigo airline_SpiceJet airline_Vistara source_city_Chennai ... arrival_time_Early_Morning arrival_time_Evening arrival_time_Late_Night arrival_time_Morning arrival_time_Night destination_city_Chennai destination_city_Delhi destination_city_Hyderabad destination_city_Kolkata destination_city_Mumbai
0 0 0 2.17 1 0 0 0 1 0 0 ... 0 0 0 0 1 0 0 0 0 1
1 0 0 2.33 1 0 0 0 1 0 0 ... 0 0 0 1 0 0 0 0 0 1
2 0 0 2.17 1 0 0 0 0 0 0 ... 1 0 0 0 0 0 0 0 0 1
3 0 0 2.25 1 0 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 1
4 0 0 2.33 1 0 0 0 0 1 0 ... 0 0 0 1 0 0 0 0 0 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
300148 1 1 10.08 49 0 0 0 0 1 1 ... 0 1 0 0 0 0 0 1 0 0
300149 1 1 10.42 49 0 0 0 0 1 1 ... 0 0 0 0 1 0 0 1 0 0
300150 1 1 13.83 49 0 0 0 0 1 1 ... 0 0 0 0 1 0 0 1 0 0
300151 1 1 10.00 49 0 0 0 0 1 1 ... 0 1 0 0 0 0 0 1 0 0
300152 1 1 10.08 49 0 0 0 0 1 1 ... 0 1 0 0 0 0 0 1 0 0

300153 rows × 29 columns

In [84]:
# Important feature using ExtraTreesRegressor

from sklearn.ensemble import ExtraTreesRegressor
selection = ExtraTreesRegressor()
selection.fit(X, y)
Out[84]:
ExtraTreesRegressor()
In [85]:
print(selection.feature_importances_)
[3.37760148e-02 8.80372309e-01 2.30366296e-02 1.87492358e-02
 5.43715482e-03 1.83641420e-04 1.67105875e-04 1.37560858e-04
 5.51644254e-03 1.04014608e-03 4.21120594e-03 1.49153286e-03
 2.01313766e-03 2.36546597e-03 1.09602817e-03 1.47663936e-03
 5.34244548e-05 1.34169382e-03 9.94190954e-04 9.38228052e-04
 1.49662651e-03 2.22150374e-04 1.06744649e-03 1.75736200e-03
 1.04105638e-03 4.10354975e-03 1.92193639e-03 1.97151964e-03
 2.02056403e-03]
In [86]:
#plot graph of feature importances for better visualization

plt.figure(figsize = (12,8))
feat_importances = pd.Series(selection.feature_importances_, index=X.columns)
feat_importances.nlargest(20).plot(kind='barh')
plt.show()
In [87]:
#cat_columns = ['airline','source_city','departure_time','stops','arrival_time','destination_city','class']
#num_columns = ['duration','days_left']
In [88]:
#encoder = OrdinalEncoder().fit_transform(df3[cat_columns])
#encoder = pd.DataFrame(encoder,columns = cat_columns)
In [89]:
#X = pd.concat([encoder,df3[num_columns]],axis=1)
#y = df3['price']
In [90]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state= 42)

Linear Regression Model¶

In [91]:
model = LinearRegression()
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
print(y_pred)
[ 3151.37494602 54895.59810797  9856.93848654 ...  4522.1445811
  -397.39426783 57811.06252282]
In [92]:
print("R2 Score: ",r2_score(y_test,y_pred))
print("Mean Squared Error: ",mean_squared_error(y_test, y_pred))
print('Mean Absolute Error', mean_absolute_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred)))
R2 Score:  0.9099031138535913
Mean Squared Error:  46443347.71222658
Mean Absolute Error 4500.712501541862
Root Mean Squared Error: 6814.935635222579

Gradient Boosting Regressor¶

In [93]:
from sklearn.ensemble import GradientBoostingRegressor

gbre = GradientBoostingRegressor()
gbre.fit(X_train, y_train)
y_pred = gbre.predict(X_test)
print(y_pred)
[ 5099.25271303 62513.40546999  8045.78944625 ...  5314.91661395
  2687.44731556 65815.10665754]
In [94]:
print("R2 Score: ",r2_score(y_test,y_pred))
print("Mean Squared Error: ",mean_squared_error(y_test, y_pred))
print('Mean Absolute Error', mean_absolute_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred)))
R2 Score:  0.9514799890396428
Mean Squared Error:  25011205.563431542
Mean Absolute Error 2972.056919745379
Root Mean Squared Error: 5001.120430806635

Random Forest Regressor¶

In [95]:
from sklearn.ensemble import RandomForestRegressor

regr = RandomForestRegressor()
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)
print(y_pred)
[ 7349.2  70802.84  6195.   ...  6496.84  3757.18 69950.71]
In [96]:
print("R2 Score: ",r2_score(y_test,y_pred))
print("Mean Squared Error: ",mean_squared_error(y_test, y_pred))
print('Mean Absolute Error', mean_absolute_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred)))
R2 Score:  0.9848578081522366
Mean Squared Error:  7805531.48050116
Mean Absolute Error 1083.9322441045008
Root Mean Squared Error: 2793.8381271113685
In [97]:
plt.figure(figsize = (8,8))
sns.kdeplot(y_test-y_pred)
plt.show()
In [98]:
plt.figure(figsize = (8,8))
plt.scatter(y_test, y_pred, alpha = 0.5)
plt.xlabel("y_test")
plt.ylabel("y_pred")
plt.show()
📌   Random Forest Regressor is the highest model based on the metrics.

Saving the model¶

In [102]:
import pickle
# open a file, where you ant to store the data
file = open('flight_rf.pkl', 'wb')

# dump information to that file
pickle.dump(regr, file)
In [103]:
model = open('flight_rf.pkl','rb')
forest = pickle.load(model)
In [104]:
y_prediction = forest.predict(X_test)
In [105]:
r2_score(y_test, y_prediction)
Out[105]:
0.9848578081522366